#
###################################################################################################
# Survey Experience and Nonresponse in an Online Probability Panel: A Survival Analysis 
###################################################################################################
#
#### DATA PREPARATION ####
#
# Set working directory to the project root directory
remove()
setwd("~/Library/CloudStorage/OneDrive-LondonSchoolofEconomics/Katya/LSE/PhD/Paper 2/Data")
library(dplyr)
library(tidyr)
library(haven)
#
data0 <- readRDS("data0.rds")

# codebook, to delete later. 
library(tidyverse)
library(openxlsx)
library(Hmisc)

# Define survey missing codes
survey_missing_codes <- c(-9, -8, -7, NA)

# Simplify variable types
simplify_type <- function(x) {
  if (inherits(x, "haven_labelled")) return("factor")  # treat labelled as factor
  if (is.factor(x)) return("factor")
  if (is.character(x)) return("character")
  if (is.numeric(x)) return("numeric")
  class(x)[1]
}

# Function to get value labels in 'code: label' format
get_values <- function(x) {
  # Hmisc or haven labelled
  if (!is.null(attr(x, "labels"))) {
    lbls <- attr(x, "labels")
    paste0(lbls, ": ", names(lbls), collapse = "; ")
  } 
  # Factor
  else if (is.factor(x)) {
    paste0(seq_along(levels(x)), ": ", levels(x), collapse = "; ")
  } 
  # Character
  else if (is.character(x)) {
    paste0(seq_along(unique(x)), ": ", unique(x), collapse = "; ")
  } else {
    NA
  }
}

# Function to detect missing values including survey codes
get_missing <- function(x) {
  # combine NA with any defined survey missing codes present in the variable
  missing_vals <- unique(x[x %in% survey_missing_codes | is.na(x)])
  paste(missing_vals, collapse = ", ")
}

# Build the codebook
codebook <- tibble(
  variable = names(data0),
  label = sapply(data0, function(x) attr(x, "label")),
  type = sapply(data0, simplify_type),
  missing_values = sapply(data0, get_missing),
  values = sapply(data0, get_values)
)

# Export to Excel
write.xlsx(codebook, "data0_codebook.xlsx")

##### 1.1. New variable: sampled ####
#
data0 <- data0 %>%
  mutate(
    sampled_w1 = ifelse(PV1_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w2 = ifelse(PV2_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w3 = ifelse(PV3_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w4pew = ifelse(PV4_SampleStatus_Actual %in% c(1, 2), 1, 0), 
    sampled_w4eb = ifelse(PV4_SampleStatus_Actual %in% c(3), 1, 0),
    sampled_w5dft = ifelse(PV5_SampleStatus_Actual %in% c(1, 3), 1, 0), 
    sampled_w5eb = ifelse(PV5_SampleStatus_Actual %in% c(2), 1, 0),
    sampled_w6 = ifelse(PV6_SampleStatus_Actual %in% c(1, 2), 1, 0),
    sampled_w7 = ifelse(PV7_SampleStatus_Actual %in% c(1, 2), 1, 0), 
    sampled_w8dcms = ifelse(PV8_SampleStatus_Actual_DCMS %in% c(1), 1, 0), 
    sampled_w8eb = ifelse(PV8_SampleStatus_Actual_EB %in% c(1,2,3), 1, 0),
    sampled_w9ricu = ifelse(PV9_SampleStatus_Actual %in% c(1), 1, 0),
    sampled_w9pew = ifelse(PV9_SampleStatus_Actual %in% c(3), 1, 0),
    sampled_w10defra = ifelse(PV10_SampleStatus_Actual %in% c(1), 1, 0),
    sampled_w10eb = ifelse(PV10_SampleStatus_Actual %in% c(2,3), 1, 0),    
    sampled_w11 = ifelse(PV11_SampleStatus_Actual %in% c(1, 2), 1, 0),
    sampled_w12dcms = ifelse(PV12_SampleStatus_Actual %in% c(1, 2), 1, 0),
    sampled_w12eb = ifelse(PV12_SampleStatus_Actual %in% c(3, 4), 1, 0),
    sampled_w13 = ifelse(PV13_SampleStatus_Actual %in% c(1, 2), 1, 0),
    sampled_w14 = ifelse(PV14_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w15 = ifelse(PV15_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w16ofcom = ifelse(PV16_SampleStatus_Actual_Ofcom %in% c(1,2), 1, 0), 
    sampled_w16ofh = ifelse(PV16_SampleStatus_Actual_OFH %in% c(1,2), 1, 0),
    sampled_w18 = ifelse(PV18_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w19 = ifelse(PV19_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w20 = ifelse(PV20_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w21 = ifelse(PV21_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w22 = ifelse(PV22_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w23 = ifelse(PV23_SampleStatus_Actual %in% c(1), 1, 0), # same respondents sampled 3 consecutive times
    sampled_w23w2 = ifelse(PV23_SampleStatus_Actual %in% c(1), 1, 0), # same respondents sampled 3 consecutive times
    sampled_w23w3 = ifelse(PV23_SampleStatus_Actual %in% c(1), 1, 0), # same respondents sampled 3 consecutive times
    sampled_w24 = ifelse(PV24_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w25 = ifelse(PV25_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
    sampled_w27 = ifelse(PV27_SampleStatus_Actual %in% c(1, 2, 3, 4), 1, 0),
  )
#
#### 1.2. New variable: survey response ####
# Convert categorical columns to factor
categorical_columns <- c("PV1_Mode","PV2_Mode","PV3_ModeB","PV6_Mode","PV7_Mode","PV9_Mode","PV10_Mode","PV11_Mode","PV12_Mode",
                         "PV13_Mode","PV14_Mode","PV15_Mode","PV18_Mode","PV19_Mode","PV20_Mode","PV21_Mode","PV22_Mode","PV24_Mode","PV25_Mode","PV27_Mode",
                         "PV4_Mode_Pew", "PV4_Mode_EB", "PV5_Mode_DfT", "PV5_Mode_EB", "PV8_Mode_DCMS", "PV8_Mode_EB", "PV16_Mode_Ofcom", "PV16_Mode_OFH", 
                         "PV23_Mode", "PV23W2_Mode", "PV23W3_Mode")

data0 <- data0 %>%
  mutate(across(all_of(categorical_columns), as.factor))
#

# Define the columns to be used for creating new variables
columns_to_use <- c(
  "sampled_w1", "sampled_w2", "sampled_w3","sampled_w6","sampled_w7", "sampled_w11", "sampled_w13", "sampled_w14","sampled_w15", 
  "sampled_w18", "sampled_w19","sampled_w20", "sampled_w21", "sampled_w22", "sampled_w24","sampled_w25", "sampled_w27",
  "PV1_Mode","PV2_Mode","PV3_ModeB","PV6_Mode","PV7_Mode","PV11_Mode","PV13_Mode","PV14_Mode","PV15_Mode",
  "PV18_Mode","PV19_Mode","PV20_Mode","PV21_Mode","PV22_Mode","PV24_Mode","PV25_Mode","PV27_Mode"
)
#
# Define the new variable names
new_variable_names <- c("nonresponse_w1", "nonresponse_w2", "nonresponse_w3","nonresponse_w6","nonresponse_w7","nonresponse_w11","nonresponse_w13", "nonresponse_w14",
                        "nonresponse_w15", "nonresponse_w18","nonresponse_w19","nonresponse_w20", "nonresponse_w21", "nonresponse_w22", 
                        "nonresponse_w24","nonresponse_w25","nonresponse_w27")
#
# Create a loop to generate new variables
for (i in seq_along(new_variable_names)) {
  data0 <- data0 %>%
    mutate(
      !!new_variable_names[i] := case_when(
        (get(columns_to_use[i]) %in% c(1) & get(columns_to_use[i + 17]) %in% c(1, 2)) ~ as.numeric(0), # response 
        (get(columns_to_use[i]) %in% c(1) & get(columns_to_use[i + 17]) == 0) ~ as.numeric(1), # nonresponse as the event in survival analysis
        TRUE ~ NA_real_
      )
    )
}
#
# Coding for 'double' waves
compute_nonresponse <- function(sampled, mode) {
  case_when(
    sampled == 1 & mode %in% c(1, 2) ~ 0,  # response
    sampled == 1 & mode == 0 ~ 1,         # nonresponse
    sampled == 0 ~ NA_real_,              # not sampled
    TRUE ~ NA_real_                       # safety catch
  )
}
double_waves <- list(
  nonresponse_w4pew = c("sampled_w4pew", "PV4_Mode_Pew"),
  nonresponse_w4eb   = c("sampled_w4eb", "PV4_Mode_EB"),
  nonresponse_w5dft  = c("sampled_w5dft", "PV5_Mode_DfT"),
  nonresponse_w5eb   = c("sampled_w5eb", "PV5_Mode_EB"),
  nonresponse_w8dcms = c("sampled_w8dcms", "PV8_Mode_DCMS"),
  nonresponse_w8eb   = c("sampled_w8eb", "PV8_Mode_EB"),
  nonresponse_w9ricu = c("sampled_w9ricu", "PV9_Mode"),
  nonresponse_w9pew  = c("sampled_w9pew", "PV9_Mode"),
  nonresponse_w10defra = c("sampled_w10defra", "PV10_Mode"),
  nonresponse_w10eb    = c("sampled_w10eb", "PV10_Mode"),
  nonresponse_w12dcms  = c("sampled_w12dcms", "PV12_Mode"),
  nonresponse_w12eb    = c("sampled_w12eb", "PV12_Mode"),
  nonresponse_w16ofcom = c("sampled_w16ofcom", "PV16_Mode_Ofcom"),
  nonresponse_w16ofh   = c("sampled_w16ofh", "PV16_Mode_OFH"),
  nonresponse_w23      = c("sampled_w23", "PV23_Mode"),
  nonresponse_w23w2    = c("sampled_w23w2", "PV23W2_Mode"),
  nonresponse_w23w3    = c("sampled_w23w3", "PV23W3_Mode")
)
# Apply to data0
for (nm in names(double_waves)) {
  sampled <- double_waves[[nm]][1]
  mode    <- double_waves[[nm]][2]
  
  data0 <- data0 %>%
    mutate("{nm}" := compute_nonresponse(.data[[sampled]], .data[[mode]]))
}
#
##### 1.3. New variable: dummynotenjoy ####
# List of old and new variable names
old_vars <- c("PV1_SurveyEnjoyment", "PV2_SurveyEnjoyment", "PV3_SurveyEnjoyment","PV4_SurveyEnjoyment_Pew","PV4_SurveyEnjoyment_EB", 
              "PV5_SurveyEnjoyment_EB","PV6_SurveyEnjoyment","PV7_SurveyEnjoyment","PV8_SurveyEnjoyment_DCMS","PV8_SurveyEnjoyment_EB", "PV9_SurveyEnjoyment_Pew", "PV10_SurveyEnjoyment_Defra",
              "PV10_SurveyEnjoyment_EB","PV11_SurveyEnjoyment", "PV12_SurveyEnjoyment_DCMS", "PV12_SurveyEnjoyment_EB","PV15_SurveyEnjoyment", 
              "PV16_SurveyEnjoyment_Ofcom","PV16_SurveyEnjoyment_OFH","PV18_SurveyEnjoyment","PV20_SurveyEnjoyment", 
              "PV21_SurveyEnjoyment", "PV22_SurveyEnjoyment","PV23_SurveyEnjoyment", "PV23W2_SurveyEnjoyment","PV23W3_SurveyEnjoyment",
              "PV24_SurveyEnjoyment", "PV25_SurveyEnjoyment","PV27_SurveyEnjoyment"
              )
new_vars <- c("notenjoy_w1", "notenjoy_w2", "notenjoy_w3", "notenjoy_w4pew","notenjoy_w4eb",
              "notenjoy_w5eb","notenjoy_w6","notenjoy_w7", "notenjoy_w8dcms","notenjoy_w8eb", "notenjoy_w9pew", "notenjoy_w10defra",
              "notenjoy_w10eb","notenjoy_w11","notenjoy_w12dcms","notenjoy_w12eb", "notenjoy_w15", 
              "notenjoy_w16ofcom","notenjoy_w16ofh","notenjoy_w18","notenjoy_w20",
              "notenjoy_w21","notenjoy_w22","notenjoy_w23","notenjoy_w23w2","notenjoy_w23w3", 
              "notenjoy_w24", "notenjoy_w25","notenjoy_w27")

names(data0)[names(data0) %in% old_vars] <- new_vars
#
# Waves with Missing PV_SurveyEnjoyment
data0 <- data0 %>%
  mutate(
    notenjoy_w5dft = NA,
    notenjoy_w9ricu = NA,
    notenjoy_w13 = NA,
    notenjoy_w14 = NA,
    notenjoy_w19 = NA
  )
#
# dummy variable
vars_list <- c("notenjoy_w1", "notenjoy_w2", "notenjoy_w3", "notenjoy_w4pew","notenjoy_w4eb",
              "notenjoy_w5eb","notenjoy_w6","notenjoy_w7", "notenjoy_w8dcms","notenjoy_w8eb", "notenjoy_w9pew", "notenjoy_w10defra",
              "notenjoy_w10eb","notenjoy_w11","notenjoy_w12dcms","notenjoy_w12eb", "notenjoy_w15", 
              "notenjoy_w16ofcom","notenjoy_w16ofh","notenjoy_w18","notenjoy_w20",
              "notenjoy_w21","notenjoy_w22","notenjoy_w23","notenjoy_w23w2","notenjoy_w23w3", 
              "notenjoy_w24", "notenjoy_w25","notenjoy_w27")
#
# Define the recode_enjoyment function
recode_enjoyment <- function(x) {
  case_when(
    x == 1 ~ 0, #'enjoyed all of it' 
    x %in% c(2, 3) ~ 1, #'I enjoyed some of it but not all of it' or 'I did not enjoy any of it'
    TRUE ~ NA_real_
  )
}
#
# recoding to a dummy variable
data0 <- data0 %>%
  mutate(
    across(
      all_of(vars_list),
      ~ recode_enjoyment(.x),
      .names = "dummy{.col}"
    )
  )
#
# waves with missing survey enjoyment
data0 <- data0 %>%
  mutate(
    dummynotenjoy_w5dft = NA,
    dummynotenjoy_w9ricu = NA,
    dummynotenjoy_w13 = NA,
    dummynotenjoy_w14 = NA,
    dummynotenjoy_w19 = NA
  )
#
##### 1.4. New variable: invite date ####
# invite dates not available in raw data
# invite date was computed as first date of survey completion
# the dates were manually taken from data0 file as the first date of survey completion for each wave
# for PV1-4 and PV14-27 soft launch is used to record the earlier date. 
#
data0 <- data0 %>%
  mutate(
    invitedate_w1 = ifelse(PV1_SampleStatus_Actual == 1 & PV1_SL == 1, "2019-11-12",
                           ifelse(PV1_SampleStatus_Actual == 1 & PV1_SL == 0, "2019-11-22", NA))
  )
data0 <- data0 %>%
  mutate(
    invitedate_w4pew = ifelse(PV4_SampleStatus_Actual == 1 & PV4_SL == 0, "2020-07-08",
                        ifelse(PV4_SampleStatus_Actual == 2 & PV4_SL == 0, "2020-07-15",
                        ifelse(PV4_SampleStatus_Actual == 1 & PV4_SL == 1, "2020-07-08", NA))))

data0 <- data0 %>%
  mutate(
    invitedate_w4eb = ifelse(PV4_SampleStatus_Actual == 3, "2020-07-22", NA))

data0 <- data0 %>%
  mutate(
    invitedate_w5dft = ifelse(PV5_SampleStatus_Actual %in% c(1, 3), "2020-08-19", NA))  

data0 <- data0 %>%
  mutate(
    invitedate_w5eb = ifelse(PV5_SampleStatus_Actual == 2, "2020-08-24", NA))    

data0 <- data0 %>%
  mutate(
    invitedate_w7 = ifelse(PV7_SampleStatus_Actual == 1, "2020-11-23",
                                ifelse(PV7_SampleStatus_Actual == 2, "2020-11-30", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w12dcms = ifelse(PV12_SampleStatus_Actual == 1, "2021-06-21",
    ifelse(PV12_SampleStatus_Actual == 2, "2021-06-23", NA)))

data0 <- data0 %>%
  mutate(
    invitedate_w12eb = ifelse(PV12_SampleStatus_Actual == 3, "2021-07-08",
    ifelse(PV12_SampleStatus_Actual == 4, "2021-06-23", NA)))

data0 <- data0 %>%
  mutate(
    invitedate_w14 = ifelse(PV14_SampleStatus_Actual == 1, "2021-11-04",
                           ifelse(PV14_SampleStatus_Actual == 2, "2021-11-09", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w15 = ifelse(PV15_SampleStatus_Actual == 1 & PV15_SL == 1, "2022-01-20",
                            ifelse(PV15_SampleStatus_Actual == 1 & PV15_SL == 0, "2022-01-24", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w16ofcom = ifelse(PV16_SampleStatus_Actual_Ofcom == 1 & PV16_SL_Ofcom == 1, "2022-03-03",
    ifelse(PV16_SampleStatus_Actual_Ofcom == 1 & PV16_SL_Ofcom == 0, "2022-03-07",
    ifelse(PV16_SampleStatus_Actual_Ofcom == 2 & PV16_SL_Ofcom == 0, "2022-03-15", NA))))

data0 <- data0 %>%
  mutate(
    invitedate_w16ofh = ifelse(PV16_SampleStatus_Actual_OFH == 1 & PV16_SL_OFH == 1, "2022-03-29",
    ifelse(PV16_SampleStatus_Actual_OFH == 1 & PV16_SL_OFH == 0, "2022-03-24",
    ifelse(PV16_SampleStatus_Actual_OFH == 2 & PV16_SL_OFH == 0, "2022-04-11", NA))))
data0 <- data0 %>%
  mutate(
    invitedate_w18 = ifelse(PV18_SampleStatus_Actual == 1 & PV18_SL == 1, "2022-04-12",
    ifelse(PV18_SampleStatus_Actual == 1 & PV18_SL == 0, "2022-04-15",
    ifelse(PV18_SampleStatus_Actual == 2 & PV18_SL == 0, "2022-04-29", NA))))
data0 <- data0 %>%
  mutate(
    invitedate_w21 = ifelse(PV21_SampleStatus_Actual == 1 & PV21_SL == 1, "2022-07-05",
    ifelse(PV21_SampleStatus_Actual == 1 & PV21_SL == 0, "2022-07-04", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w22 = ifelse(PV22_SampleStatus_Actual == 1 & PV22_SL == 1, "2022-08-02",
    ifelse(PV22_SampleStatus_Actual == 1 & PV22_SL == 0, "2022-08-01", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w23 = ifelse(PV23_SampleStatus_Actual == 1 & PV23_SL == 1, "2022-08-26",
    ifelse(PV23_SampleStatus_Actual == 1 & PV23_SL == 0, "2022-08-25", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w23w2 = ifelse(PV23_SampleStatus_Actual == 1 & PV23_SL == 1, "2022-11-14",
    ifelse(PV23_SampleStatus_Actual == 1 & PV23_SL == 0, "2022-11-07", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w23w3 = ifelse(PV23_SampleStatus_Actual == 1 & PV23_SL == 1, "2023-02-22",
    ifelse(PV23_SampleStatus_Actual == 1 & PV23_SL == 0, "2023-02-20", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w24 = ifelse(PV24_SampleStatus_Actual == 1 & PV24_SL == 1, "2022-10-18",
    ifelse(PV24_SampleStatus_Actual == 1 & PV24_SL == 0, "2022-10-19", NA)))
data0 <- data0 %>%
  mutate(
    invitedate_w25 = ifelse(PV25_SampleStatus_Actual == 1 & PV25_SL == 1, "2022-11-28",
    ifelse(PV25_SampleStatus_Actual == 1 & PV25_SL == 0, "2022-11-30",
    ifelse(PV25_SampleStatus_Actual == 2 & PV25_SL == 0, "2022-12-19", NA))))
data0 <- data0 %>%
  mutate(
    invitedate_w27 = ifelse(PV27_SampleStatus_Actual == 1 & PV27_SL == 1, "2023-03-20",
    ifelse(PV27_SampleStatus_Actual == 1 & PV27_SL == 0, "2023-03-17",
    ifelse(PV27_SampleStatus_Actual == 2 & PV27_SL == 0, "2023-03-22", NA))))

# 'simple vars'
data0 <- data0 %>%
  mutate(
    invitedate_w2 = ifelse(PV2_SampleStatus_Actual == 1, "2020-03-04",NA))

data0 <- data0 %>%
  mutate(
    invitedate_w3 = ifelse(PV3_SampleStatus_Actual == 1, "2020-05-01", NA),
    invitedate_w6 = ifelse(PV6_SampleStatus_Actual %in% c(1,2), "2020-10-26", NA),
    invitedate_w8dcms = ifelse(PV8_SampleStatus_Actual_DCMS %in% c(1, 2), "2021-02-15", NA),
    invitedate_w8eb = ifelse(PV8_SampleStatus_Actual_EB %in% c(1, 2, 3), "2021-02-23", NA),
    invitedate_w9ricu = ifelse(PV9_SampleStatus_Actual == 1, "2021-03-09", NA),
    invitedate_w9pew = ifelse(PV9_SampleStatus_Actual == 3, "2021-03-09", NA),
    invitedate_w10defra = ifelse(PV10_SampleStatus_Actual == 1, "2021-04-07", NA),
    invitedate_w10eb = ifelse(PV10_SampleStatus_Actual %in% c(2, 3), "2021-04-16", NA),
    invitedate_w11 = ifelse(PV11_SampleStatus_Actual %in% c(1, 2), "2021-05-19", NA),
    invitedate_w13 = ifelse(PV13_SampleStatus_Actual == 1, "2021-10-20", NA),
    invitedate_w19 = ifelse(PV19_SampleStatus_Actual %in% c(1, 2, 3), "2022-04-04", NA),
    invitedate_w20 = ifelse(PV20_SampleStatus_Actual == 1, "2022-05-11", NA)
  )
#
#### 1.5. New variable: data collection date #####
# 
variable_mappings <- data.frame(
  new_variable = c("surveydate_w1", "surveydate_w2", "surveydate_w3", "surveydate_w4pew","surveydate_w4eb",
                     "surveydate_w5dft", "surveydate_w5eb","surveydate_w6", "surveydate_w7","surveydate_w8dcms",
                     "surveydate_w8eb", 
                     "surveydate_w11", "surveydate_w13","surveydate_w14",
                     "surveydate_w15", "surveydate_w16ofcom","surveydate_w16ofh", "surveydate_w18", "surveydate_w20",
                     "surveydate_w21", "surveydate_w22", "surveydate_w23","surveydate_w23w2", "surveydate_w23w3",
                     "surveydate_w24","surveydate_w25", "surveydate_w27"),
  old_variable = c("PV1_DataCollectionDate", "PV2_DataCollectionDate", "PV3_DataCollectionDate","PV4_DataCollectionDate_Pew", "PV4_DataCollectionDate_EB", 
                   "PV5_DataCollectionDate_DfT","PV5_DataCollectionDate_EB", "PV6_DataCollectionDate", "PV7_DataCollectionDate","PV8_DataCollectionDate_DCMS", 
                   "PV8_DataCollectionDate_EB",
                   "PV11_DataCollectionDate", "PV13_DataCollectionDate", "PV14_DataCollectionDate", 
                   "PV15_DataCollectionDate","PV16_DataCollectionDate_Ofcom", "PV16_DataCollectionDate_OFH", "PV18_DataCollectionDate","PV20_DataCollectionDate", 
                   "PV21_DataCollectionDate", "PV22_DataCollectionDate","PV23_DataCollectionDate", "PV23W2_DataCollectionDate", "PV23W3_DataCollectionDate",
                   "PV24_DataCollectionDate", "PV25_DataCollectionDate", "PV27_DataCollectionDate")
)
# Loop through each row in the mapping dataframe
for (i in 1:nrow(variable_mappings)) {
  new_var <- variable_mappings$new_variable[i]
  old_var <- variable_mappings$old_variable[i]
  
  # Add new variable to data1 with the same values as the old variable
  data0 <- data0 %>%
    mutate(!!new_var := .data[[old_var]])
}
#
# dates for double waves with a single original DataCollection variable
data0 <- data0 %>%
  mutate(surveydate_w9ricu = ifelse(sampled_w9ricu == 1, PV9_DataCollectionDate, NA))
data0 <- data0 %>%
  mutate(surveydate_w9pew = ifelse(sampled_w9pew == 1, PV9_DataCollectionDate, NA))
#
data0 <- data0 %>%
  mutate(surveydate_w10defra = ifelse(sampled_w10defra == 1, PV10_DataCollectionDate, NA))
data0 <- data0 %>%
  mutate(surveydate_w10eb = ifelse(sampled_w10eb == 1, PV10_DataCollectionDate, NA))
#
data0 <- data0 %>%
  mutate(surveydate_w12dcms = ifelse(sampled_w12dcms == 1, PV12_DataCollectionDate, NA))
data0 <- data0 %>%
  mutate(surveydate_w12eb = ifelse(sampled_w12eb == 1, PV12_DataCollectionDate, NA))
#
#### 1.6. New variable: length of survey ####
# List of old and new variable names
old <- c("PV1_LengthInMinutes", "PV2_LengthInMinutes", "PV3_LengthInMinutes", "PV4_LengthInMinutes_Pew", "PV4_LengthInMinutes_EB",
              "PV5_LengthInMinutes_DfT", "PV5_LengthInMinutes_EB", "PV6_LengthInMinutes", "PV7_LengthInMinutes", "PV8_LengthInMinutes_DCMS",
              "PV8_LengthInMinutes_EB", "PV11_LengthInMinutes",
              "PV13_LengthInMinutes", "PV14_LengthInMinutes", "PV15_LengthInMinutes", "PV16_LengthInMinutes_Ofcom", "PV16_LengthInMinutes_OFH",
              "PV18_LengthInMinutes", "PV20_LengthInMinutes", "PV21_LengthInMinutes", "PV22_LengthInMinutes", "PV23_LengthInMinutes",
              "PV23W2_LengthInMinutes", "PV23W3_LengthInMinutes", "PV24_LengthInMinutes", "PV25_LengthInMinutes", "PV27_LengthInMinutes"
)
new <- c("minutes_w1", "minutes_w2", "minutes_w3", "minutes_w4pew", "minutes_w4eb",
              "minutes_w5dft", "minutes_w5eb", "minutes_w6", "minutes_w7", "minutes_w8dcms",
              "minutes_w8eb", "minutes_w11",
              "minutes_w13", "minutes_w14", "minutes_w15", "minutes_w16ofcom", "minutes_w16ofh",
              "minutes_w18", "minutes_w20", "minutes_w21", "minutes_w22", "minutes_w23",
              "minutes_w23w2", "minutes_w23w3", "minutes_w24", "minutes_w25", "minutes_w27"
)

names(data0)[names(data0) %in% old] <- new
#
# minutes for double waves with a single original Minutes variable

data0 <- data0 %>%
  mutate(minutes_w9ricu = ifelse(sampled_w9ricu == 1, PV9_LengthInMinutes, NA))
data0 <- data0 %>%
  mutate(minutes_w9pew = ifelse(sampled_w9pew == 1, PV9_LengthInMinutes, NA))
#
data0 <- data0 %>%
  mutate(minutes_w10defra = ifelse(sampled_w10defra == 1, PV10_LengthInMinutes, NA))
data0 <- data0 %>%
  mutate(minutes_w10eb = ifelse(sampled_w10eb == 1, PV10_LengthInMinutes, NA))
#
data0 <- data0 %>%
  mutate(minutes_w12dcms = ifelse(sampled_w12dcms == 1, PV12_LengthInMinutes, NA))
data0 <- data0 %>%
  mutate(minutes_w12eb = ifelse(sampled_w12eb == 1, PV12_LengthInMinutes, NA))
# 
# w19 variables as NA due to missing data
data0 <- data0 %>%
  mutate(
    surveydate_w19 = NA,
    minutes_w19 = NA
  )
#
#### 1.7. New variable: survey length, excluding outliers ####
# 
# Outliers defined using 1.5*IQR rule

variables <- c("minutes_w1", "minutes_w2", "minutes_w3", "minutes_w4pew", "minutes_w4eb",
               "minutes_w5dft", "minutes_w5eb", "minutes_w6", "minutes_w7", "minutes_w8dcms",
               "minutes_w8eb", "minutes_w9ricu", "minutes_w9pew", "minutes_w10defra",
               "minutes_w10eb", "minutes_w11", "minutes_w12dcms", "minutes_w12eb", "minutes_w13",
               "minutes_w14", "minutes_w15", "minutes_w16ofcom", "minutes_w16ofh", "minutes_w18",
               "minutes_w19", "minutes_w20", "minutes_w21", "minutes_w22", "minutes_w23",
               "minutes_w23w2", "minutes_w23w3", "minutes_w24", "minutes_w25", "minutes_w27")

# Define the outlier cutoff
outlier_cutoff <- 1.5
# Function to remove outliers and create new variables and outlier flags
remove_outliers <- function(data, var, outlier_cutoff) {
  # Calculate quartiles and IQR
  Q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
#  
# Define bounds
  lower_bound <- Q1 - outlier_cutoff * IQR
  upper_bound <- Q3 + outlier_cutoff * IQR
  
  # Create new variable names
  new_var <- paste0("nooutl", var)
  outlier_flag <- paste0("outliers_", var)
  
  # Identify outliers
  is_outlier <- ifelse(is.na(data[[var]]), NA, 
                       ifelse(data[[var]] < lower_bound | data[[var]] > upper_bound, 1, 0))
  
  # Create new variables
  data[[new_var]] <- ifelse(is_outlier == 1, NA, data[[var]])
  data[[outlier_flag]] <- is_outlier
  
  return(data)
}

# Apply the function to each variable
for (variable in variables) {
  data0 <- remove_outliers(data = data0, var = variable, outlier_cutoff = outlier_cutoff)
}

# Create a single 'outliers' variable indicating any outlier across all minutes variables
outlier_vars <- paste0("outliers_", variables)
data0$outliers <- as.integer(rowSums(data0[outlier_vars], na.rm = TRUE) > 0)

#### 1.8. New variable: median survey length ####
#
# Replace data0 with the actual name of your dataset
data0 <- data0 %>%
  mutate(
    median_w1 = median(minutes_w1, na.rm = TRUE),
    median_w2 = median(minutes_w2, na.rm = TRUE),
    median_w3 = median(minutes_w3, na.rm = TRUE),
    median_w4pew = median(minutes_w4pew, na.rm = TRUE),
    median_w4eb = median(minutes_w4eb, na.rm = TRUE),
    median_w5dft = median(minutes_w5dft, na.rm = TRUE),
    median_w5eb = median(minutes_w5eb, na.rm = TRUE),
    median_w6 = median(minutes_w6, na.rm = TRUE),
    median_w7 = median(minutes_w7, na.rm = TRUE),
    median_w8dcms = median(minutes_w8dcms, na.rm = TRUE),
    median_w8eb = median(minutes_w8eb, na.rm = TRUE),
    median_w9ricu = median(minutes_w9ricu, na.rm = TRUE),
    median_w9pew = median(minutes_w9pew, na.rm = TRUE),
    median_w10defra = median(minutes_w10defra, na.rm = TRUE),
    median_w10eb = median(minutes_w10eb, na.rm = TRUE),
    median_w11 = median(minutes_w11, na.rm = TRUE),
    median_w12dcms = median(minutes_w12dcms, na.rm = TRUE),
    median_w12eb = median(minutes_w12eb, na.rm = TRUE),
    median_w13 = median(minutes_w13, na.rm = TRUE),
    median_w14 = median(minutes_w14, na.rm = TRUE),
    median_w15 = median(minutes_w15, na.rm = TRUE),
    median_w16ofcom = median(minutes_w16ofcom, na.rm = TRUE),
    median_w16ofh = median(minutes_w16ofh, na.rm = TRUE),
    median_w18 = median(minutes_w18, na.rm = TRUE),
    median_w19 = median(minutes_w19, na.rm = TRUE),
    median_w20 = median(minutes_w20, na.rm = TRUE),
    median_w21 = median(minutes_w21, na.rm = TRUE),
    median_w22 = median(minutes_w22, na.rm = TRUE),
    median_w23 = median(minutes_w23, na.rm = TRUE),
    median_w23w2 = median(minutes_w23w2, na.rm = TRUE),
    median_w23w3 = median(minutes_w23w3, na.rm = TRUE),
    median_w24 = median(minutes_w24, na.rm = TRUE),
    median_w25 = median(minutes_w25, na.rm = TRUE),
    median_w27 = median(minutes_w27, na.rm = TRUE)
  ) %>%
  ungroup()

#### 1.9. New variable: nooutlmedian survey length (same as median survey length) ####
#
data0 <- data0 %>%
  mutate(
    nooutlmedian_w1 = median(nooutlminutes_w1, na.rm = TRUE),
    nooutlmedian_w2 = median(nooutlminutes_w2, na.rm = TRUE),
    nooutlmedian_w3 = median(nooutlminutes_w3, na.rm = TRUE),
    nooutlmedian_w4pew = median(nooutlminutes_w4pew, na.rm = TRUE),
    nooutlmedian_w4eb = median(nooutlminutes_w4eb, na.rm = TRUE),
    nooutlmedian_w5dft = median(nooutlminutes_w5dft, na.rm = TRUE),
    nooutlmedian_w5eb = median(nooutlminutes_w5eb, na.rm = TRUE),
    nooutlmedian_w6 = median(nooutlminutes_w6, na.rm = TRUE),
    nooutlmedian_w7 = median(nooutlminutes_w7, na.rm = TRUE),
    nooutlmedian_w8dcms = median(nooutlminutes_w8dcms, na.rm = TRUE),
    nooutlmedian_w8eb = median(nooutlminutes_w8eb, na.rm = TRUE),
    nooutlmedian_w9ricu = median(nooutlminutes_w9ricu, na.rm = TRUE),
    nooutlmedian_w9pew = median(nooutlminutes_w9pew, na.rm = TRUE),
    nooutlmedian_w10defra = median(nooutlminutes_w10defra, na.rm = TRUE),
    nooutlmedian_w10eb = median(nooutlminutes_w10eb, na.rm = TRUE),
    nooutlmedian_w11 = median(nooutlminutes_w11, na.rm = TRUE),
    nooutlmedian_w12dcms = median(nooutlminutes_w12dcms, na.rm = TRUE),
    nooutlmedian_w12eb = median(nooutlminutes_w12eb, na.rm = TRUE),
    nooutlmedian_w13 = median(nooutlminutes_w13, na.rm = TRUE),
    nooutlmedian_w14 = median(nooutlminutes_w14, na.rm = TRUE),
    nooutlmedian_w15 = median(nooutlminutes_w15, na.rm = TRUE),
    nooutlmedian_w16ofcom = median(nooutlminutes_w16ofcom, na.rm = TRUE),
    nooutlmedian_w16ofh = median(nooutlminutes_w16ofh, na.rm = TRUE),
    nooutlmedian_w18 = median(nooutlminutes_w18, na.rm = TRUE),
    nooutlmedian_w19 = median(nooutlminutes_w19, na.rm = TRUE),
    nooutlmedian_w20 = median(nooutlminutes_w20, na.rm = TRUE),
    nooutlmedian_w21 = median(nooutlminutes_w21, na.rm = TRUE),
    nooutlmedian_w22 = median(nooutlminutes_w22, na.rm = TRUE),
    nooutlmedian_w23 = median(nooutlminutes_w23, na.rm = TRUE),
    nooutlmedian_w23w2 = median(nooutlminutes_w23w2, na.rm = TRUE),
    nooutlmedian_w23w3 = median(nooutlminutes_w23w3, na.rm = TRUE),
    nooutlmedian_w24 = median(nooutlminutes_w24, na.rm = TRUE),
    nooutlmedian_w25 = median(nooutlminutes_w25, na.rm = TRUE),
    nooutlmedian_w27 = median(nooutlminutes_w27, na.rm = TRUE)
  ) %>%
  ungroup()

#
#### 1.10. New variable: mode of completion  ####
# code 0 = No response, code 1 = Online, code 2 = Telephone
# 
# List of old and new variable names
old_mode <- c("PV1_Mode", "PV2_Mode", "PV3_ModeB", "PV4_Mode_Pew", "PV4_Mode_EB", "PV5_Mode_DfT",
              "PV5_Mode_EB", "PV6_Mode", "PV7_Mode", "PV8_Mode_DCMS", "PV8_Mode_EB", "PV11_Mode", "PV13_Mode", "PV14_Mode", 
              "PV15_Mode", "PV16_Mode_Ofcom", "PV16_Mode_OFH", "PV18_Mode", "PV19_Mode", "PV20_Mode", "PV21_Mode",
              "PV22_Mode", "PV23_Mode", "PV23W2_Mode", "PV23W3_Mode", "PV24_Mode", "PV25_Mode", "PV27_Mode"
)
new_mode <- c("mode_w1", "mode_w2", "mode_w3", "mode_w4pew", "mode_w4eb","mode_w5dft", 
              "mode_w5eb", "mode_w6", "mode_w7", "mode_w8dcms","mode_w8eb", "mode_w11", "mode_w13", "mode_w14", 
              "mode_w15", "mode_w16ofcom", "mode_w16ofh","mode_w18", "mode_w19","mode_w20", "mode_w21", 
              "mode_w22", "mode_w23","mode_w23w2", "mode_w23w3", "mode_w24", "mode_w25", "mode_w27"
)
names(data0)[names(data0) %in% old_mode] <- new_mode
#
# mode for double waves with a single original Mode variable
library(purrr)
# Helper function to recode mode
recode_mode <- function(sampled, mode) {
  case_when(
    sampled == 1 & mode == 1 ~ 0,
    sampled == 1 & mode == 2 ~ 1,
    sampled == 1 & mode == 0 ~ NA_real_,
    TRUE ~ NA_real_
  )
}
# List of new variable names -> corresponding sampled and mode columns
waves <- list(
  mode_w9ricu     = c("sampled_w9ricu", "PV9_Mode"),
  mode_w9pew      = c("sampled_w9pew", "PV9_Mode"),
  mode_w10defra   = c("sampled_w10defra", "PV10_Mode"),
  mode_w10eb      = c("sampled_w10eb", "PV10_Mode"),
  mode_w12dcms    = c("sampled_w12dcms", "PV12_Mode"),
  mode_w12eb      = c("sampled_w12eb", "PV12_Mode")
)

# Apply helper function to all waves
for (nm in names(waves)) {
  sampled <- waves[[nm]][1]
  mode <- waves[[nm]][2]
  
  data0 <- data0 %>%
    mutate("{nm}" := recode_mode(.data[[sampled]], .data[[mode]]))
}
# 
# change Mode into a dummy (1 = telephone, 0 = online, NA = no response)
vars_list <- c("mode_w1", "mode_w2", "mode_w3", "mode_w4pew", "mode_w4eb","mode_w5dft", 
               "mode_w5eb", "mode_w6", "mode_w7", "mode_w8dcms","mode_w8eb",
               "mode_w11", "mode_w13", "mode_w14", 
               "mode_w15", "mode_w16ofcom", "mode_w16ofh","mode_w18", "mode_w19","mode_w20", "mode_w21", 
               "mode_w22", "mode_w23","mode_w23w2", "mode_w23w3", "mode_w24", "mode_w25", "mode_w27")

# Define the recode_mode function
recode_mode <- function(x) {
  case_when(
    x == 2 ~ 1, # telephone mode 
    x == 1 ~ 0, # online mode
    TRUE ~ NA_real_
  )
}
data0 <- data0 %>%
  mutate(across(all_of(vars_list), recode_mode))

# save as data1
saveRDS(data0, "data1.rds")
#
##### 1.11. Create a reduced version of the dataset ####
data1 <- readRDS("data1.rds")
# renaming variables
data1 <- data1 %>%
  rename(
    phonemode_w1 = mode_w1,
    phonemode_w2 = mode_w2,
    phonemode_w3 = mode_w3,
    phonemode_w4pew = mode_w4pew,
    phonemode_w4eb = mode_w4eb,
    phonemode_w5dft = mode_w5dft,
    phonemode_w5eb = mode_w5eb,
    phonemode_w6 = mode_w6,
    phonemode_w7 = mode_w7,
    phonemode_w8dcms = mode_w8dcms,
    phonemode_w8eb = mode_w8eb,
    phonemode_w9ricu = mode_w9ricu,
    phonemode_w9pew = mode_w9pew,
    phonemode_w10defra = mode_w10defra,
    phonemode_w10eb = mode_w10eb,
    phonemode_w11 = mode_w11,
    phonemode_w12dcms = mode_w12dcms,
    phonemode_w12eb = mode_w12eb,
    phonemode_w13 = mode_w13,
    phonemode_w14 = mode_w14,
    phonemode_w15 = mode_w15,
    phonemode_w16ofcom = mode_w16ofcom,
    phonemode_w16ofh = mode_w16ofh,
    phonemode_w18 = mode_w18,
    phonemode_w19 = mode_w19,
    phonemode_w20 = mode_w20,
    phonemode_w21 = mode_w21,
    phonemode_w22 = mode_w22,
    phonemode_w23 = mode_w23,
    phonemode_w23w2 = mode_w23w2,
    phonemode_w23w3 = mode_w23w3,
    phonemode_w24 = mode_w24,
    phonemode_w25 = mode_w25,
    phonemode_w27 = mode_w27
  )

# create a short version 'data2'
data2 <- data1 %>%
  select(
    InternalSerialNumber, RS_Mode, RS_Sex, SF_SampleSource,
    WeightStratum, 
    sampled_w1, sampled_w2, sampled_w3, sampled_w4pew,
    sampled_w4eb, sampled_w5dft, sampled_w5eb, sampled_w6, sampled_w7, sampled_w8dcms,
    sampled_w8eb, sampled_w9ricu, sampled_w9pew, sampled_w10defra, sampled_w10eb, sampled_w11,
    sampled_w12dcms, sampled_w12eb, sampled_w13, sampled_w14, sampled_w15, sampled_w16ofcom,
    sampled_w16ofh, sampled_w18, sampled_w19, sampled_w20, sampled_w21, sampled_w22,
    sampled_w23, sampled_w23w2, sampled_w23w3, sampled_w24, sampled_w25, sampled_w27,
    invitedate_w1, invitedate_w2, invitedate_w3, invitedate_w4pew, invitedate_w4eb, invitedate_w5dft,
    invitedate_w5eb, invitedate_w6, invitedate_w7, invitedate_w8dcms, invitedate_w8eb, invitedate_w9ricu,
    invitedate_w9pew, invitedate_w10defra, invitedate_w10eb, invitedate_w11, invitedate_w12dcms, invitedate_w12eb,
    invitedate_w13, invitedate_w14, invitedate_w15, invitedate_w16ofcom, invitedate_w16ofh, invitedate_w18,
    invitedate_w19, invitedate_w20, invitedate_w21, invitedate_w22, invitedate_w23, invitedate_w23w2,
    invitedate_w23w3, invitedate_w24, invitedate_w25, invitedate_w27, 
    nonresponse_w1, nonresponse_w2,
    nonresponse_w3, nonresponse_w4pew, nonresponse_w4eb, nonresponse_w5dft, nonresponse_w5eb, nonresponse_w6,
    nonresponse_w7, nonresponse_w8dcms, nonresponse_w8eb, nonresponse_w9ricu, nonresponse_w9pew, nonresponse_w10defra,
    nonresponse_w10eb, nonresponse_w11, nonresponse_w12dcms, nonresponse_w12eb, nonresponse_w13, nonresponse_w14,
    nonresponse_w15, nonresponse_w16ofcom, nonresponse_w16ofh, nonresponse_w18, nonresponse_w19, nonresponse_w20,
    nonresponse_w21, nonresponse_w22, nonresponse_w23, nonresponse_w23w2, nonresponse_w23w3, nonresponse_w24,
    nonresponse_w25, nonresponse_w27, 
    surveydate_w1, surveydate_w2, surveydate_w3, surveydate_w4pew,
    surveydate_w4eb, surveydate_w5dft, surveydate_w5eb, surveydate_w6, surveydate_w7, surveydate_w8dcms,
    surveydate_w8eb, surveydate_w9ricu, surveydate_w9pew, surveydate_w10defra, surveydate_w10eb, surveydate_w11,
    surveydate_w12dcms, surveydate_w12eb, surveydate_w13, surveydate_w14, surveydate_w15, surveydate_w16ofcom,
    surveydate_w16ofh, surveydate_w18, surveydate_w19, surveydate_w20, surveydate_w21, surveydate_w22,
    surveydate_w23, surveydate_w23w2, surveydate_w23w3, surveydate_w24, surveydate_w25, surveydate_w27,
    notenjoy_w1, notenjoy_w2, notenjoy_w3, notenjoy_w4pew, notenjoy_w4eb, notenjoy_w5dft,
    notenjoy_w5eb, notenjoy_w6, notenjoy_w7, notenjoy_w8dcms, notenjoy_w8eb, notenjoy_w9ricu,
    notenjoy_w9pew, notenjoy_w10defra, notenjoy_w10eb, notenjoy_w11, notenjoy_w12dcms, notenjoy_w12eb,
    notenjoy_w13, notenjoy_w14, notenjoy_w15, notenjoy_w16ofcom, notenjoy_w16ofh, notenjoy_w18,
    notenjoy_w19, notenjoy_w20, notenjoy_w21, notenjoy_w22, notenjoy_w23, notenjoy_w23w2,
    notenjoy_w23w3, notenjoy_w24, notenjoy_w25, notenjoy_w27, 
    dummynotenjoy_w1, dummynotenjoy_w2,
    dummynotenjoy_w3, dummynotenjoy_w4pew, dummynotenjoy_w4eb, dummynotenjoy_w5dft, dummynotenjoy_w5eb, dummynotenjoy_w6,
    dummynotenjoy_w7, dummynotenjoy_w8dcms, dummynotenjoy_w8eb, dummynotenjoy_w9ricu, dummynotenjoy_w9pew,
    dummynotenjoy_w10defra, dummynotenjoy_w10eb, dummynotenjoy_w11, dummynotenjoy_w12dcms, dummynotenjoy_w12eb,
    dummynotenjoy_w13, dummynotenjoy_w14, dummynotenjoy_w15, dummynotenjoy_w16ofcom, dummynotenjoy_w16ofh,
    dummynotenjoy_w18, dummynotenjoy_w19, dummynotenjoy_w20, dummynotenjoy_w21, dummynotenjoy_w22,
    dummynotenjoy_w23, dummynotenjoy_w23w2, dummynotenjoy_w23w3, dummynotenjoy_w24, dummynotenjoy_w25,
    dummynotenjoy_w27, 
    minutes_w1, minutes_w2, minutes_w3, minutes_w4pew, minutes_w4eb, minutes_w5dft,
    minutes_w5eb, minutes_w6, minutes_w7, minutes_w8dcms, minutes_w8eb, minutes_w9ricu,
    minutes_w9pew, minutes_w10defra, minutes_w10eb, minutes_w11, minutes_w12dcms, minutes_w12eb,
    minutes_w13, minutes_w14, minutes_w15, minutes_w16ofcom, minutes_w16ofh, minutes_w18,
    minutes_w19, minutes_w20, minutes_w21, minutes_w22, minutes_w23, minutes_w23w2,
    minutes_w23w3, minutes_w24, minutes_w25, minutes_w27, 
    nooutlminutes_w1, nooutlminutes_w2,
    nooutlminutes_w3, nooutlminutes_w4pew, nooutlminutes_w4eb, nooutlminutes_w5dft, nooutlminutes_w5eb,
    nooutlminutes_w6, nooutlminutes_w7, nooutlminutes_w8dcms, nooutlminutes_w8eb, nooutlminutes_w9ricu,
    nooutlminutes_w9pew, nooutlminutes_w10defra, nooutlminutes_w10eb, nooutlminutes_w11, nooutlminutes_w12dcms,
    nooutlminutes_w12eb, nooutlminutes_w13, nooutlminutes_w14, nooutlminutes_w15, nooutlminutes_w16ofcom,
    nooutlminutes_w16ofh, nooutlminutes_w18, nooutlminutes_w19, nooutlminutes_w20, nooutlminutes_w21,
    nooutlminutes_w22, nooutlminutes_w23, nooutlminutes_w23w2, nooutlminutes_w23w3, nooutlminutes_w24,
    nooutlminutes_w25, nooutlminutes_w27, 
    median_w1, median_w2, median_w3, median_w4pew, median_w4eb, median_w5dft, median_w5eb, median_w6, median_w7,
    median_w8dcms, median_w8eb, median_w9ricu, median_w9pew, median_w10defra, median_w10eb, median_w11, median_w12dcms,
    median_w12eb, median_w13, median_w14, median_w15, median_w16ofcom, median_w16ofh, median_w18, median_w19,
    median_w20, median_w21, median_w22, median_w23, median_w23w2, median_w23w3, median_w24, median_w25, median_w27,
    nooutlmedian_w1, nooutlmedian_w2, nooutlmedian_w3, nooutlmedian_w4pew, nooutlmedian_w4eb, nooutlmedian_w5dft,
    nooutlmedian_w5eb, nooutlmedian_w6, nooutlmedian_w7, nooutlmedian_w8dcms, nooutlmedian_w8eb, nooutlmedian_w9ricu,
    nooutlmedian_w9pew, nooutlmedian_w10defra, nooutlmedian_w10eb, nooutlmedian_w11, nooutlmedian_w12dcms, nooutlmedian_w12eb,
    nooutlmedian_w13, nooutlmedian_w14, nooutlmedian_w15, nooutlmedian_w16ofcom, nooutlmedian_w16ofh, nooutlmedian_w18,
    nooutlmedian_w19, nooutlmedian_w20, nooutlmedian_w21, nooutlmedian_w22, nooutlmedian_w23, nooutlmedian_w23w2,
    nooutlmedian_w23w3, nooutlmedian_w24, nooutlmedian_w25, nooutlmedian_w27, 
    phonemode_w1, phonemode_w2, phonemode_w3, phonemode_w4pew, phonemode_w4eb,
    phonemode_w5dft, phonemode_w5eb, phonemode_w6, phonemode_w7, phonemode_w8dcms, phonemode_w8eb,
    phonemode_w9ricu, phonemode_w9pew, phonemode_w10defra, phonemode_w10eb, phonemode_w11, phonemode_w12dcms,
    phonemode_w12eb, phonemode_w13, phonemode_w14, phonemode_w15, phonemode_w16ofcom, phonemode_w16ofh,
    phonemode_w18, phonemode_w19, phonemode_w20, phonemode_w21, phonemode_w22, phonemode_w23,
    phonemode_w23w2, phonemode_w23w3, phonemode_w24, phonemode_w25, phonemode_w27
)
#
saveRDS(data2, "data2.rds")
#
##### 1.12. Merge in variables from the Recruitment Survey #####
#
# Read in the original data files from Kantar
library(haven)
recruits <- read_sav("RS_allforLSE.sav")

recruits_subset <- recruits %>%
  select(InternalSerialNumber, RS_DataCollectionDate, RS_Age_TenYearGroup, RS_EducationLevel, RS_Degree
         , RS_Weight
         , RS_CohabitationStatus_Binary, RS_HousingTenure_Reduced, RS_UKBirthCountry, RS_BirthCountry
         , RS_EthnicGroupReduced, RS_WorkingStatus_Binary, RS_DisabilityStatus 
         , SF_IMDDecile, SF_Region_N
         , RS_BFI_2_XS_ExtraversionScore, RS_BFI_2_XS_AgreeablenessScore, RS_BFI_2_XS_ConscientiousnessScore           
         , RS_BFI_2_XS_NegativeEmotionalityScore, RS_BFI_2_XS_OpenMindednessScore
         , RS_PoliticalEngagement)
data2 <- readRDS("data2.rds")
data3 <- data2 %>%
  left_join(recruits_subset, by = "InternalSerialNumber")
saveRDS(data3, "data3.rds")
#
#### 1.13. SF_SampleSource into dummy ####
data4 <- readRDS("data3.rds")
data4 <- data4 %>%
  mutate(abos = case_when(
    SF_SampleSource == 1 ~ 1, # recruited using address based sampling or push-to-web
    SF_SampleSource == 0 ~ 0, # recruited face-to-face
    TRUE ~ NA_real_
  ))
#
#### 1.14. RS_Sex into dummy ####
#
data4 <- data4 %>%
  mutate(Male = case_when(
    RS_Sex == 1 ~ 1,
    RS_Sex == 2 ~ 0,
    TRUE ~ NA_real_
  ))
#
#### 1.15. RS_Age group into dummy ####
#
data4$RS_Age_TenYearGroup_factor <- factor(data4$RS_Age_TenYearGroup)

data4 <- data4 %>%
  mutate(
    age16_24 = ifelse(RS_Age_TenYearGroup_factor == 1, 1, 0),
    age25_34 = ifelse(RS_Age_TenYearGroup_factor == 2, 1, 0),
    age35_44 = ifelse(RS_Age_TenYearGroup_factor == 3, 1, 0),
    age45_54 = ifelse(RS_Age_TenYearGroup_factor == 4, 1, 0),
    age55_64 = ifelse(RS_Age_TenYearGroup_factor == 5, 1, 0),
    age65_74 = ifelse(RS_Age_TenYearGroup_factor == 6, 1, 0),
    age75plus = ifelse(RS_Age_TenYearGroup_factor == 7, 1, 0)
  )
#
#### 1.16. Disability variable - change into dummy ####
#
data4 <- data4 %>%
  mutate(
    disability = case_when(
      RS_DisabilityStatus == 4 ~ 0, #'No' 
      RS_DisabilityStatus %in% c(1, 2, 3) ~ 1, #'Yes'
      TRUE ~ NA_real_
    )
  )
#
#### 1.17. RS_BirthCountry - change into dummy ####
data4 <- data4 %>%
  mutate(
    UKborn = case_when(
      RS_UKBirthCountry == 5 ~ 0, #'Not born in the UK' 
      RS_UKBirthCountry %in% c(1, 2, 3, 4) ~ 1, #'UK country'
      TRUE ~ NA_real_
    )
  )
#
#### 1.18. RS_HousingTenure_Reduced into dummies ####
data4 <- data4 %>%
  mutate(
    housing_tenure = case_when(
      RS_HousingTenure_Reduced == 3 ~ 0, #'Rent/other' 
      RS_HousingTenure_Reduced %in% c(1, 2) ~ 1, #'Own outright or on mortgage'
      TRUE ~ NA_real_
    )
  )
#
#### 1.19. RS_EthnicGroupReduced ####
data4 <- data4 %>%
  mutate(
    White = case_when(
      RS_EthnicGroupReduced %in% c(3, 4) ~ 0, #'South Asian or Other'
      RS_EthnicGroupReduced %in% c(1, 2) ~ 1, #'White and white other'
      TRUE ~ NA_real_
    )
  )
#
#### 1.20. Region (London as the reference category) ####
data4$NE_England <- ifelse(data4$SF_Region_N==1,1,0)
data4$NW_England <- ifelse(data4$SF_Region_N==2,1,0)
data4$Yorkshire_Humberside <- ifelse(data4$SF_Region_N==3,1,0)
data4$E_Midlands <- ifelse(data4$SF_Region_N==4,1,0)
data4$W_Midlands <- ifelse(data4$SF_Region_N==5,1,0)
data4$Eastof_England <- ifelse(data4$SF_Region_N==6,1,0)
data4$SE_England <- ifelse(data4$SF_Region_N==8,1,0)
data4$SW_England <- ifelse(data4$SF_Region_N==9,1,0)
data4$NI <- ifelse(data4$SF_Region_N==10,1,0)
data4$Scotland <- ifelse(data4$SF_Region_N==11,1,0)
data4$Wales <- ifelse(data4$SF_Region_N==12,1,0)
#
#### 1.21.  Political engagement re-coding ####
data4$follows_politics <- ifelse(data4$RS_PoliticalEngagement %in% c(1, 2), 1,
                         ifelse(data4$RS_PoliticalEngagement %in% c(3, 4), 0, NA))
#
# Save cleaned cross-sectional data for reuse
saveRDS(data4, "data4.rds")
#
#### 2. PANEL DATA LONG FORMATTING ####
#
# Reload cleaned data
data4 <- readRDS("data4.rds")
data4 <- data4 %>%
  select(-age16_24,-RS_Age_TenYearGroup_factor, -RS_HousingTenure_Reduced, -RS_UKBirthCountry, -RS_BirthCountry, -RS_EthnicGroupReduced, -RS_DisabilityStatus, -SF_Region_N, -RS_PoliticalEngagement)

#### 2.1. Long format commands ####
library(splitstackshape)
data4_stacklong <- splitstackshape::merged.stack(data4,
                                                 id.vars = c('InternalSerialNumber','RS_Mode','SF_SampleSource','RS_Sex'
                                                             ,'WeightStratum','RS_DataCollectionDate','RS_Age_TenYearGroup'
                                                             , "disability", "UKborn", 'housing_tenure', 'RS_WorkingStatus_Binary', 'RS_CohabitationStatus_Binary', 'White'
                                                             ,'NE_England', 'NW_England', 'Yorkshire_Humberside', 'E_Midlands', 'W_Midlands', 'Eastof_England' 
                                                             ,'SE_England', 'SW_England', 'NI', 'Scotland', 'Wales', 'SF_IMDDecile'     
                                                             ,'RS_EducationLevel','RS_Degree','RS_Weight','RS_BFI_2_XS_ExtraversionScore','RS_BFI_2_XS_AgreeablenessScore','RS_BFI_2_XS_ConscientiousnessScore'
                                                             ,'RS_BFI_2_XS_NegativeEmotionalityScore','RS_BFI_2_XS_OpenMindednessScore'                     
                                                             ,'abos','Male','age25_34','age35_44','age45_54','age55_64','age65_74','age75plus','follows_politics'),
                                                 var.stubs = c("sampled","invitedate","nonresponse","surveydate","notenjoy","dummynotenjoy","phonemode","minutes", "nooutlminutes", "median", "nooutlmedian"),
                                                 sep = "_",
                                                 keep.all = TRUE
)

data4_stacklong <- data4_stacklong %>% 
  arrange(InternalSerialNumber, invitedate)
#
#### 2.2. New variable: ind_sequence (time) ####
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, invitedate) %>%
  group_by(InternalSerialNumber) %>%
  mutate(temp_sequence = cumsum(sampled == 1)) %>%
  mutate(ind_sequence = if_else(sampled == 1, temp_sequence, as.integer(NA))) %>%
  select(-temp_sequence) %>% # Remove the temporary sequence column
  arrange(InternalSerialNumber, ind_sequence)
#
# delete all cases where ind_sequence=NA as these will not be included in the analysis. 
data4_stacklong <- data4_stacklong %>%
  filter(!is.na(ind_sequence))
#
#### 2.3. New variable: 'y' (outcome = time to event) ####
#
mark_first_nonresponse <- function(df) {
  # Find the first occurrence of nonresponse == 1
  first_nonresponse <- which(df$nonresponse == 1)[1]
  
  # Create a new column 'y', default to 0
  df$y <- 0
  
  # If there is a nonresponse, set 'y' to 1 for that row
  if (!is.na(first_nonresponse)) {
    df$y[first_nonresponse] <- 1
  }
  
  return(df)
}

# Group by InternalSerialNumber and apply the function
data4_stacklong <- data4_stacklong %>% 
  group_by(InternalSerialNumber) %>%
  group_modify(~ mark_first_nonresponse(.))

# Ungroup the data
data4_stacklong <- ungroup(data4_stacklong)
#
#### 2.4. Delete records after the event 'y' ####
# 
# Mark the records that need to be deleted within unit-periods after the event has occurred (y=1) and unit-periods with no events get code 0 at 'delete'.
# Function to apply to each group for creating 'delete' variable
mark_delete_post_y <- function(df) {
  # Check if all 'y' values are 0
  if (all(df$y == 0)) {
    # If all 'y' values are 0, then set all 'delete' values to 0
    df$delete <- 0
  } else {
    # Initialize 'delete' column to 0
    df$delete <- 0
    
    # Find the index where y == 1
    y_index <- which(df$y == 1)
    
    # If 'y' is 1 for a row, set 'delete' to 1 for all subsequent rows
    if (length(y_index) > 0 && y_index < nrow(df)) {
      df$delete[(y_index + 1):nrow(df)] <- 1
    }
  }
  
  return(df)
}

# Group by InternalSerialNumber and apply the function
data4_stacklong <- data4_stacklong %>% 
  group_by(InternalSerialNumber) %>%
  group_modify(~ mark_delete_post_y(.x))
#
# Ungroup the data
data4_stacklong <- ungroup(data4_stacklong)
#
# delete all records after event
data4_stacklong <- data4_stacklong %>%
  filter(delete != 1)
#
##### 2.5. New variable: prev_dummynotenjoy #####
#
# Arrange data by InternalSerialNumber and ind_sequence
data4_stacklong <- data4_stacklong %>% 
  arrange(InternalSerialNumber, ind_sequence)

# Replace NA values in dummynotenjoy with 99
data4_stacklong <- data4_stacklong %>%
  mutate(dummynotenjoy = ifelse(is.na(dummynotenjoy), 99, dummynotenjoy))

# Create prev_dummynotenjoy
data4_stacklong <- data4_stacklong %>%
  group_by(InternalSerialNumber) %>%
  mutate(prev_dummynotenjoy = lag(dummynotenjoy))

# Replace all 99 values with NA in dummynotenjoy
data4_stacklong <- data4_stacklong %>%
  mutate(dummynotenjoy = ifelse(dummynotenjoy == 99, NA_real_, dummynotenjoy))

# Replace all 99 values with NA in prev_dummynotenjoy
data4_stacklong <- data4_stacklong %>%
  mutate(prev_dummynotenjoy = ifelse(prev_dummynotenjoy == 99, NA_real_, prev_dummynotenjoy))
#
#### 2.6. New variable: Days since last survey invite ####
# caclulated as 'days since last invite' = invitedate for current wave - invite date of previous/last survey taken park
#
data4_stacklong <- data4_stacklong %>%
  mutate(invitedate = as.Date(invitedate, format = "%Y-%m-%d"),
         surveydate = as.Date(surveydate, origin = "1970-01-01", format = "%Y-%m-%d"))
data4_stacklong <- data4_stacklong %>%
  group_by(InternalSerialNumber) %>%
  arrange(InternalSerialNumber, ind_sequence) %>%
  mutate(prev_invitedate = lag(invitedate),
         days_since_lastinvite = as.numeric(invitedate - prev_invitedate)) %>%
  ungroup()
#
#### 2.7. New variable: Days since last survey invite (categorical) ####
# Calculate quantiles
quantiles <- quantile(data4_stacklong$days_since_lastinvite, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)

# Define category labels
labels <- c("4to29days", "30to68days", "69to112days", "Over112days")

# Create categorical variable
data4_stacklong$category_days_since_lastinvite <- cut(data4_stacklong$days_since_lastinvite, 
                                                      breaks = quantiles, 
                                                      labels = labels,
                                                      include.lowest = TRUE)
#
# dummies
data4_stacklong$days_since_lastinvite_4to29 <- ifelse(data4_stacklong$category_days_since_lastinvite=="4to29days",1,0)
data4_stacklong$days_since_lastinvite_30to68 <- ifelse(data4_stacklong$category_days_since_lastinvite=="30to68days",1,0)
data4_stacklong$days_since_lastinvite_69to112 <- ifelse(data4_stacklong$category_days_since_lastinvite=="69to112days",1,0)
data4_stacklong$days_since_lastinvite_over112 <- ifelse(data4_stacklong$category_days_since_lastinvite=="Over112days",1,0)
#
#### 2.8. Lagged proportion nonenjoyment - new variable, excludes last notenjoy ####
#
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, invitedate) %>%
  group_by(InternalSerialNumber) %>%
  mutate(
    count_prev_dummynotenjoy = cumsum(!is.na(prev_dummynotenjoy) & prev_dummynotenjoy %in% c(0, 1)),
    sum_prev_dummynotenjoy = cumsum(ifelse(!is.na(prev_dummynotenjoy) & prev_dummynotenjoy %in% c(1), prev_dummynotenjoy, 0)),
    prop_lagged_dummynotenjoy = ifelse(count_prev_dummynotenjoy > 0, round(sum_prev_dummynotenjoy / count_prev_dummynotenjoy, 2), NA),
    prop_lagged_dummynotenjoy_excl_last = lag(prop_lagged_dummynotenjoy)
  )
#
#### 2.9. New variable: days since recruitment survey ####
#
library(lubridate)
data4_stacklong <- data4_stacklong %>%
  group_by(InternalSerialNumber) %>%
  mutate(first_sampled = if_else(ind_sequence == 1, ymd(invitedate), NA_Date_)) %>%
  ungroup() %>%
  mutate(across(ends_with("Date"), ymd)) %>%
  mutate(days_since_recruit = as.integer(difftime(first_sampled, RS_DataCollectionDate , units = "days"))) %>%
  arrange(InternalSerialNumber, ind_sequence) %>% 
  fill(first_sampled, days_since_recruit, .direction = "down")

# Calculate quantiles
quantiles <- quantile(data4_stacklong$days_since_recruit, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)

# Define category labels
labels <- c("Under66days", "66to99days", "100to241days", "Over241days")

# Create categorical variable
data4_stacklong$days_since_recruit_cat <- cut(data4_stacklong$days_since_recruit, 
                                                      breaks = quantiles, 
                                                      labels = labels,
                                                      include.lowest = TRUE)
#
# dummies
data4_stacklong$days_since_recruit_under66 <- ifelse(data4_stacklong$days_since_recruit_cat=="Under66days",1,0)
data4_stacklong$days_since_recruit_66to99 <- ifelse(data4_stacklong$days_since_recruit_cat=="66to99days",1,0)
data4_stacklong$days_since_recruit_100to241 <- ifelse(data4_stacklong$days_since_recruit_cat=="100to241days",1,0)
data4_stacklong$days_since_recruit_over241 <- ifelse(data4_stacklong$days_since_recruit_cat=="Over241days",1,0)
#
#### 2.10. Previous telephone mode of completion - new variable ####
#
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence) %>%
  group_by(InternalSerialNumber) %>%
  mutate(prev_phonemode = lag(phonemode)) %>%
  ungroup()
#
#### 2.11. Previous survey length in minutes - based on original minutes ####
#
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence) %>%
  group_by(InternalSerialNumber) %>%
  mutate(prev_minutes = lag(minutes)) %>%
  ungroup()
#
#### 2.12. Previous survey length in minutes - based on nooutlminutes ####
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence) %>%
  group_by(InternalSerialNumber) %>%
  mutate(prev_nooutlminutes = lag(nooutlminutes)) %>%
  ungroup()
#
#### 2.12.1. prev_nooutlminutes into quintiles ####
#
# Calculate quintiles
quintiles_prev_nooutlminutes <- quantile(data4_stacklong$prev_nooutlminutes, probs = seq(0, 1, by = 0.2), na.rm = TRUE)

# Create quintile variable
data4_stacklong <- data4_stacklong %>%
  mutate(prev_nooutlminutes_quintile = cut(prev_nooutlminutes, breaks = quintiles_prev_nooutlminutes, labels = FALSE, include.lowest = TRUE)) %>%
  mutate(
    prev_nooutlminutes_q1 = case_when(prev_nooutlminutes_quintile == 1 ~ 1, TRUE ~ 0),
    prev_nooutlminutes_q2 = case_when(prev_nooutlminutes_quintile == 2 ~ 1, TRUE ~ 0),
    prev_nooutlminutes_q3 = case_when(prev_nooutlminutes_quintile == 3 ~ 1, TRUE ~ 0),
    prev_nooutlminutes_q4 = case_when(prev_nooutlminutes_quintile == 4 ~ 1, TRUE ~ 0),
    prev_nooutlminutes_q5 = case_when(prev_nooutlminutes_quintile == 5 ~ 1, TRUE ~ 0)
  )

#### 2.13. Median_prev_survey - based on original minutes ####
#
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence) %>%
  group_by(InternalSerialNumber) %>%
  mutate(median_prev_survey = round(lag(median), 2)) %>%
  ungroup()
#
#### 2.13.1. Median_prev_survey quintiles ####
# Calculate quintiles
quintiles_median_prev_survey <- quantile(data4_stacklong$median_prev_survey, probs = seq(0, 1, by = 0.2), na.rm = TRUE)

# Create quintile variable
data4_stacklong <- data4_stacklong %>%
  mutate(median_prev_survey_quintile = cut(median_prev_survey, breaks = quintiles_median_prev_survey, labels = FALSE, include.lowest = TRUE)) %>%
  mutate(
    median_prev_survey_q1 = case_when(median_prev_survey_quintile == 1 ~ 1, TRUE ~ 0),
    median_prev_survey_q2 = case_when(median_prev_survey_quintile == 2 ~ 1, TRUE ~ 0),
    median_prev_survey_q3 = case_when(median_prev_survey_quintile == 3 ~ 1, TRUE ~ 0),
    median_prev_survey_q4 = case_when(median_prev_survey_quintile == 4 ~ 1, TRUE ~ 0),
    median_prev_survey_q5 = case_when(median_prev_survey_quintile == 5 ~ 1, TRUE ~ 0)
  )

#### 2.14. Nooutlmedian_prev_survey - based on nooutlminutes ####
#
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence) %>%
  group_by(InternalSerialNumber) %>%
  mutate(nooutlmedian_prev_survey = round(lag(nooutlmedian), 2)) %>%
  ungroup()
#
#### 2.15. New variable: lead_nonresponse ####
data4_stacklong <- data4_stacklong %>% 
  group_by(InternalSerialNumber) %>%
  arrange(ind_sequence) %>%
  mutate(lead_nonresponse = lead(nonresponse))
#
#### 2.16 New variable: Verian wave dummies in groups ####
#
data4_stacklong <- data4_stacklong %>%
  mutate(w1_dummy = ifelse(.time_1 == "w1", 1, 0),
         w2_dummy = ifelse(.time_1 == "w2", 1, 0),
         w4pew_dummy = ifelse(.time_1 == "w4pew", 1, 0),
         w16ofh_dummy = ifelse(.time_1 == "w16ofh", 1, 0),
         w18_dummy = ifelse(.time_1 == "w18", 1, 0),
         w20_dummy = ifelse(.time_1 == "w20", 1, 0),
         w22_dummy = ifelse(.time_1 == "w22", 1, 0),
         w23_dummy = ifelse(.time_1 == "w23", 1, 0),
         w23w2_dummy = ifelse(.time_1 == "w23w2", 1, 0),
         w23w3_dummy = ifelse(.time_1 == "w23w3", 1, 0))

# Arrange the dataset
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence)
#
#### 2.17. Dummies for weight stratum ####
#
#freq(data4_stacklong$WeightStratum, cum = TRUE, total = TRUE)
data4_stacklong$weight_stratum2 <- ifelse(data4_stacklong$WeightStratum==2,1,0)
data4_stacklong$weight_stratum3 <- ifelse(data4_stacklong$WeightStratum==3,1,0)
data4_stacklong$weight_stratum4 <- ifelse(data4_stacklong$WeightStratum==4,1,0)
data4_stacklong$weight_stratum5 <- ifelse(data4_stacklong$WeightStratum==5,1,0)
#
#### 2.18. New dummy for 3rd invite and later ####
#
# new dummy variable of whether 3rd or later survey invite
data4_stacklong <- data4_stacklong %>%
  mutate(third_or_later_invite = case_when(
    ind_sequence >= 3 ~ 1,
    ind_sequence == 2 ~ 0,
    TRUE ~ NA_real_
  ))
#
#### 2.19. Recode the lagged variables where ind_sequence=2 ####
#
# recode prop_lag_dummynotenj_new so that where a case has ind_sequence 2, then prop_lag_dummynotenj_new is 0. 
data4_stacklong <- data4_stacklong %>%
  mutate(prop_lag_dummynotenj_new = ifelse(ind_sequence == 2, 0, prop_lagged_dummynotenjoy_excl_last))
#
#### 2.20. Create dummies ind_sequence1-23
#
# Create dummy variables for ind_sequence
for (i in 1:23) {
  dummy_var_name <- paste0("ind_sequence", i)
  data4_stacklong <- data4_stacklong %>%
    mutate(!!dummy_var_name := ifelse(ind_sequence == i, 1, 0))
}
# Arrange the dataset
data4_stacklong <- data4_stacklong %>%
  arrange(InternalSerialNumber, ind_sequence)
#
#### 2.21. New variable: number of invites (including the current invite) ####
data4_stacklong <- data4_stacklong %>%
  group_by(InternalSerialNumber) %>%
  arrange(row_number()) %>%
  mutate(n_invites = cumsum(sampled))

##### Save the cleaned DataFrame back to the same RDS file ####
#
saveRDS(data4_stacklong, "data4_stacklong.rds")
# 